21231
MDSC-303 (Data Visualization)
Assignment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import plotly.graph_objects as go
import plotly.express as px
from skimage import io
- for visualization libraries
# matplotlib.pyplot: shared text sizes and colours (the Plotly template below
# reuses the same four constants).
TITLE_SIZE = 30
LABEL_SIZE = 20
TITLE_COLOR = '#000000'
LABEL_COLOR = '#707070'

# Global defaults for every Matplotlib figure drawn afterwards.
plt.rcParams.update({
    'font.size': 15,
    'axes.titlesize': TITLE_SIZE,
    'axes.titlecolor': TITLE_COLOR,
    'axes.titlepad': 20,
    'axes.labelsize': LABEL_SIZE,
    'axes.labelcolor': LABEL_COLOR,
})
# plotly: a template carrying the same text sizes/colours as the Matplotlib
# defaults, plus the display options handed to fig.show(config=...).
_body_font = {"family": "Nunito", "size": LABEL_SIZE, "color": LABEL_COLOR}
_title_font = {"family": "Lato", "size": TITLE_SIZE, "color": TITLE_COLOR}

custom_template = {
    "layout": go.Layout(
        font=_body_font,
        title={"font": _title_font},
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
        colorway=px.colors.qualitative.G10,
    ),
}

# Display options: no Plotly logo, no mode bar.
# ('staticPlot': True can be added here to render non-interactive figures.)
plotly_config = {'displaylogo': False, 'displayModeBar': False}
The dataset contains a variety of stats (attack, defense, special attack, ..., Japanese name, pokedex_number, generation, ..., capture_rate, abilities, etc.) for every Pokémon up to generation 7.
It is taken from Kaggle: The Complete Pokemon Dataset
# Load the Pokemon CSV with pandas' default decoding.
# (An earlier attempt passed encoding='utf-16-le' and on_bad_lines='skip'.)
DATASET_PATH = './Dataset/pokemon.csv'
df = pd.read_csv(DATASET_PATH)
df.head()
| abilities | against_bug | against_dark | against_dragon | against_electric | against_fairy | against_fight | against_fire | against_flying | against_ghost | ... | percentage_male | pokedex_number | sp_attack | sp_defense | speed | type1 | type2 | weight_kg | generation | is_legendary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ['Overgrow', 'Chlorophyll'] | 1.0 | 1.0 | 1.0 | 0.5 | 0.5 | 0.5 | 2.0 | 2.0 | 1.0 | ... | 88.1 | 1 | 65 | 65 | 45 | grass | poison | 6.9 | 1 | 0 |
| 1 | ['Overgrow', 'Chlorophyll'] | 1.0 | 1.0 | 1.0 | 0.5 | 0.5 | 0.5 | 2.0 | 2.0 | 1.0 | ... | 88.1 | 2 | 80 | 80 | 60 | grass | poison | 13.0 | 1 | 0 |
| 2 | ['Overgrow', 'Chlorophyll'] | 1.0 | 1.0 | 1.0 | 0.5 | 0.5 | 0.5 | 2.0 | 2.0 | 1.0 | ... | 88.1 | 3 | 122 | 120 | 80 | grass | poison | 100.0 | 1 | 0 |
| 3 | ['Blaze', 'Solar Power'] | 0.5 | 1.0 | 1.0 | 1.0 | 0.5 | 1.0 | 0.5 | 1.0 | 1.0 | ... | 88.1 | 4 | 60 | 50 | 65 | fire | NaN | 8.5 | 1 | 0 |
| 4 | ['Blaze', 'Solar Power'] | 0.5 | 1.0 | 1.0 | 1.0 | 0.5 | 1.0 | 0.5 | 1.0 | 1.0 | ... | 88.1 | 5 | 80 | 65 | 80 | fire | NaN | 19.0 | 1 | 0 |
5 rows × 41 columns
We reorder the columns for a better view.
# Put the most informative columns first, then every remaining column in its
# original CSV order. Building the remainder with a list comprehension (rather
# than a set difference) keeps the column layout identical on every run,
# because set iteration order is arbitrary.
main_attributes = [
    'name', 'type1', 'type2', 'hp', 'defense', 'sp_defense', 'attack',
    'sp_attack', 'speed', 'generation', 'abilities', 'base_total',
    'is_legendary',
]
main_attributes += [col for col in df.columns if col not in main_attributes]
df = df[main_attributes]
df.head()
| name | type1 | type2 | hp | defense | sp_defense | attack | sp_attack | speed | generation | ... | against_rock | against_fire | against_normal | against_dragon | against_psychic | against_steel | against_dark | height_m | weight_kg | percentage_male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bulbasaur | grass | poison | 45 | 49 | 65 | 49 | 65 | 45 | 1 | ... | 1.0 | 2.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 0.7 | 6.9 | 88.1 |
| 1 | Ivysaur | grass | poison | 60 | 63 | 80 | 62 | 80 | 60 | 1 | ... | 1.0 | 2.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 | 13.0 | 88.1 |
| 2 | Venusaur | grass | poison | 80 | 123 | 120 | 100 | 122 | 80 | 1 | ... | 1.0 | 2.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 2.0 | 100.0 | 88.1 |
| 3 | Charmander | fire | NaN | 39 | 43 | 50 | 52 | 60 | 65 | 1 | ... | 2.0 | 0.5 | 1.0 | 1.0 | 1.0 | 0.5 | 1.0 | 0.6 | 8.5 | 88.1 |
| 4 | Charmeleon | fire | NaN | 58 | 58 | 65 | 64 | 80 | 80 | 1 | ... | 2.0 | 0.5 | 1.0 | 1.0 | 1.0 | 0.5 | 1.0 | 1.1 | 19.0 | 88.1 |
5 rows × 41 columns
# (rows, columns) of the reordered frame.
df.shape
(801, 41)
# Per-column dtype and non-null count; used to spot columns with missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 801 entries, 0 to 800 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 801 non-null object 1 type1 801 non-null object 2 type2 417 non-null object 3 hp 801 non-null int64 4 defense 801 non-null int64 5 sp_defense 801 non-null int64 6 attack 801 non-null int64 7 sp_attack 801 non-null int64 8 speed 801 non-null int64 9 generation 801 non-null int64 10 abilities 801 non-null object 11 base_total 801 non-null int64 12 is_legendary 801 non-null int64 13 against_bug 801 non-null float64 14 against_ground 801 non-null float64 15 classfication 801 non-null object 16 against_fairy 801 non-null float64 17 pokedex_number 801 non-null int64 18 capture_rate 801 non-null object 19 experience_growth 801 non-null int64 20 against_electric 801 non-null float64 21 against_flying 801 non-null float64 22 against_fight 801 non-null float64 23 against_grass 801 non-null float64 24 against_ghost 801 non-null float64 25 against_ice 801 non-null float64 26 japanese_name 801 non-null object 27 against_water 801 non-null float64 28 base_egg_steps 801 non-null int64 29 base_happiness 801 non-null int64 30 against_poison 801 non-null float64 31 against_rock 801 non-null float64 32 against_fire 801 non-null float64 33 against_normal 801 non-null float64 34 against_dragon 801 non-null float64 35 against_psychic 801 non-null float64 36 against_steel 801 non-null float64 37 against_dark 801 non-null float64 38 height_m 781 non-null float64 39 weight_kg 781 non-null float64 40 percentage_male 703 non-null float64 dtypes: float64(21), int64(13), object(7) memory usage: 256.7+ KB
# Tally of fully duplicated rows (True) versus unique rows (False).
df.duplicated().value_counts()
False 801 dtype: int64
Inference:
# Correct the 'classfication' typo in the column name: copy the values to a
# correctly spelt column (appended last) and drop the misspelt one.
df = df.assign(classification=df['classfication']).drop(columns=['classfication'])

# Pokemon without a secondary type get the literal string 'None' instead of NaN.
df = df.fillna({'type2': 'None'})

df['generation'].unique()
array([1, 2, 3, 4, 5, 6, 7])
# Summary statistics for the key numeric stats, one row per stat.
imp = [
    'hp', 'attack', 'defense', 'speed',
    'height_m', 'weight_kg', 'sp_attack', 'sp_defense',
]
df.loc[:, imp].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| hp | 801.0 | 68.958801 | 26.576015 | 1.0 | 50.0 | 65.0 | 80.0 | 255.0 |
| attack | 801.0 | 77.857678 | 32.158820 | 5.0 | 55.0 | 75.0 | 100.0 | 185.0 |
| defense | 801.0 | 73.008739 | 30.769159 | 5.0 | 50.0 | 70.0 | 90.0 | 230.0 |
| speed | 801.0 | 66.334582 | 28.907662 | 5.0 | 45.0 | 65.0 | 85.0 | 180.0 |
| height_m | 781.0 | 1.163892 | 1.080326 | 0.1 | 0.6 | 1.0 | 1.5 | 14.5 |
| weight_kg | 781.0 | 61.378105 | 109.354766 | 0.1 | 9.0 | 27.3 | 64.8 | 999.9 |
| sp_attack | 801.0 | 71.305868 | 32.353826 | 10.0 | 45.0 | 65.0 | 91.0 | 194.0 |
| sp_defense | 801.0 | 70.911361 | 27.942501 | 20.0 | 50.0 | 66.0 | 90.0 | 230.0 |
Inference:
We remove:
# Pairwise correlations between the numeric columns only. They are selected
# explicitly because recent pandas versions raise on text columns (name,
# abilities, ...) instead of silently skipping them.
numeric_stats = df.select_dtypes(include='number').drop(columns=['pokedex_number', 'base_total'])
corr = numeric_stats.corr()  # finding the correlations

# Keep only strong correlations: |r| > 0.6, and |r| < 1 to blank the diagonal.
strength = corr.abs()
corr = corr.where((strength > 0.6) & (strength < 1))

# Features with no strong partner are entirely NaN; drop them along both axes.
NaNs = corr.columns[corr.isna().all()].values
corr = corr.drop(index=NaNs, columns=NaNs)

# plotting
plt.figure(figsize=(19, 15))
sns.heatmap(corr, fmt='.2g', annot=True, linewidths=1, linecolor='lightgrey', cmap='rocket_r')
plt.title("(Only)High(abs(x)>0.6) Stats Correlations")
plt.xticks(size=15)
plt.yticks(size=15, rotation=0)
plt.show()
Inference:
Only the highest correlations are shown here, as showing the correlation for every pair would be too crowded.
From the correlation heatmap above we also see that there is no high correlation between the base stats.
# plotting: scatter matrix of four base stats, coloured by legendary status
base_stats = ['hp', 'defense', 'attack', 'speed']
g = sns.pairplot(df, hue='is_legendary', x_vars=base_stats, y_vars=base_stats)
# y > 1 places the title above the top row of panels.
g.fig.suptitle('Pairplot of base stats', size=35, y=1.08)
plt.show()
Inference:
When we think of a Pokémon, the first things we look at are attack and defense, so we want to investigate their relationship in more depth.
Plotly's interactive graphs show exactly which point is which Pokémon; adding the Pokémon names as labels on the points would make the visualization too cluttered.
# Work on a copy whose is_legendary flag is text rather than an integer, so the
# colour encoding is categorical.
tmp = df.assign(is_legendary=df['is_legendary'].astype(str))

fig = px.scatter(
    tmp,
    x='attack',
    y='defense',
    color='is_legendary',
    hover_data=['name', 'attack', 'defense'],
    title="Attack vs Defense scatterplot",
    height=800,
    width=800,
    template=custom_template,
)
fig.show(config=plotly_config)
Inference:
Happiny is a Normal-type Baby Pokémon introduced in Generation IV. She is among the Pokémon with the highest base_happiness in the franchise.
# Displaying images of the Pokemon highlighted above, side by side in one row.
images = ['Happiny', 'Heracross', 'Aggron', 'Steelix', 'Shuckle']
plt.figure(figsize=(20, 10))
for i, title in enumerate(images, start=1):
    plt.subplot(1, len(images), i)
    # Image files are named after the lower-cased Pokemon name.
    im = io.imread('./Images/' + title.lower() + '.jpeg')
    # Drawn with Matplotlib directly; skimage.io.imshow is deprecated.
    plt.imshow(im)
    plt.title(title, fontsize=25)
    plt.axis('off')
plt.show()
# Two horizontal count plots side by side: all Pokemon per generation (left)
# and the same counts split by legendary status (right).
_, (ax_all, ax_split) = plt.subplots(1, 2, figsize=(25, 7))

sns.countplot(data=df, y='generation', color='#8D72E1', ax=ax_all)
ax_all.set_title('Pokemons Introduced per Generation', size=24)

sns.countplot(
    data=df,
    y='generation',
    hue='is_legendary',
    palette=['#636efa', '#ff3f41'],
    ax=ax_split,
)
ax_split.set_title('Legendary & Non-Legendary Pokemons Introduced per Generation', size=24)

plt.show()
Inference:
Using a bump chart we can see the primary types introduced in each generation.
Here they are ranked by the number of Pokémon of that type introduced in the given generation.
#computing values
# Pokemon count per (generation, type1) pair, ordered by that two-level index.
tmp = df.groupby(['generation'])['type1'].value_counts().sort_index() #getting the number of types for each generation
# Overwrite each generation's counts with ranks. Generations 1..7 are hard-coded.
for i in range(1,8):
# Position in ascending count order, subtracted from the number of types, so
# the type with the largest count gets rank 1. Ties receive distinct ranks.
tmp[i] = (len(tmp[i]) - tmp[i].sort_values().argsort()).sort_index() #ranking the types for each generation; no overlapping ranks;
# Distinct primary types; one line is drawn per entry in the bump chart.
types = df.type1.unique()
# Notebook display for the last generation (the loop leaves i == 7).
# NOTE(review): tmp[i] already holds ranks at this point, so this expression
# re-applies the formula to ranks rather than counts -- confirm this is intended.
(len(tmp[i]) - tmp[i].sort_values().argsort()).sort_index()
type1 bug 13 dark 1 dragon 5 electric 6 fairy 2 fighting 7 fire 11 ghost 8 grass 15 ground 3 normal 16 poison 9 psychic 12 rock 10 steel 4 water 14 Name: type1, dtype: int64
# plotting bump chart using the line plots in matplotlib.pyplot
from matplotlib.ticker import FixedFormatter, FixedLocator

fig, ax = plt.subplots(figsize=(20, 10))

# secondary axis (right-hand side): each generation-7 rank is labelled with
# the type that holds it, so the lines can be read off at their right end
yax2 = ax.secondary_yaxis("right")
yax2.yaxis.set_major_locator(FixedLocator(tmp[7]))
yax2.yaxis.set_major_formatter(FixedFormatter(tmp[7].index))

# plotting the lines: one per primary type, x = generations in which the type
# appears, y = its rank there (both taken from the same sorted slice)
for t in types:
    ranks = tmp[:, t].sort_index()
    ax.plot(ranks.index.values, ranks, 'o-', mfc='w', label=t)

# setting support markers
# The tick-label size goes through tick_params: set_yticks only accepts text
# keyword arguments when explicit labels are passed together with the ticks.
ax.set_yticks(np.arange(1, 18))
ax.tick_params(axis='y', labelsize=15)
ax.grid()
ax.set(xlabel='Generation', ylabel='Rank', title='Popularity of Primary type by Generation')
ax.invert_yaxis()  # make top as first
ax.legend(loc=(1.06, 0))
plt.show()
Inference:
# computing values: one row per (type, slot) holding the number of Pokemon that
# carry the type as their primary (Type-1) or secondary (Type-2) type
frames = []
for column, slot in (('type1', 'Type-1'), ('type2', 'Type-2')):
    counts = df[column].value_counts().rename_axis('Type').reset_index(name='Count')
    counts['Type-class'] = slot
    frames.append(counts)
tmp12 = pd.concat(frames, ignore_index=True)

# plotting: grouped horizontal bars, one colour per slot
fig = px.histogram(
    tmp12,
    x='Count',
    y='Type',
    color='Type-class',
    barmode='group',
    height=700,
    width=1400,
    title='Type-1 and Type-2 class',
    labels={'sum of Count': 'Count'},
    template=custom_template,
)
fig.update_yaxes(categoryorder='trace')
fig.show()
Inference:
attributes = ['attack', 'defense', 'hp', 'sp_attack', 'sp_defense', 'speed']
colors = ['#d62728', '#9467bd', '#2da12e', '#f68318', '#8d574c', '#2e77b4']

# Mean of each stat for the two primary types, ordered alphabetically by stat
# name (the same order in which `attributes` is written).
gh = df.loc[df['type1'] == 'ghost', attributes].mean().sort_index()
da = df.loc[df['type1'] == 'dark', attributes].mean().sort_index()

plt.figure(figsize=(7, 10))
# plotting the lines: one per stat, left end = Ghost mean, right end = Dark mean
for stat, color, ghost_mean, dark_mean in zip(attributes, colors, gh, da):
    plt.plot([1, 2], (ghost_mean, dark_mean), 'o-', mfc='w', label=stat, color=color)

plt.gca().margins(x=0.8)  # horizontal padding around the ghost & dark vertical lines
plt.xticks([1, 2], ['Ghost', 'Dark'], size=15)
plt.grid(axis='x')
plt.legend(loc=(1.04, 0.5))
plt.yticks([])
plt.title('Ghost vs Dark Stats')
plt.show()
Inference:
Pokémon with stats above the 80% quantile
# Stats used as successive funnel stages, in filtering order.
features = ['attack', 'defense', 'speed', 'hp', 'sp_attack', 'sp_defense']

# Global 80% quantile of each stat. Only the stat columns are passed to
# quantile(), so the text columns of df never reach it.
lims = df[features].quantile(.80)


def _funnel(frame):
    """Filter *frame* stat by stat; return the survivors and the count left after each stage."""
    counts = []
    for f in features:
        # Each stage keeps only rows that also beat this stat's threshold.
        frame = frame[frame[f] > lims[f]]
        counts.append(len(frame))
    return frame, counts


tmp1, vals1 = _funnel(df[df.is_legendary == 1])  # just for legendary
tmp2, vals2 = _funnel(df[df.is_legendary == 0])  # just for non legendary

fig = go.Figure()
fig.add_trace(go.Funnel(y=features, x=vals1, name='Legendary', marker={'color': '#ff3f41'}))
fig.add_trace(go.Funnel(y=features, x=vals2, name='Non-Legendary', marker={'color': '#636efa'}))
fig.update_layout(title="Filtering Best Pokemons Funnel Chart")
fig.show()
Inference:
# Legendary Pokemon left after the last funnel stage; the count is derived from
# the data instead of being hard-coded in the message.
print(f"The {len(tmp1)} pokemons:", tmp1.name.values)
The 3 pokemons: ['Rayquaza' 'Palkia' 'Arceus']
Design principles generally prefer multiple grouped bar charts, parallel coordinates plots, etc. over radar charts, because a radar chart distorts perception: the actual comparison is based on the point values, but the chart draws attention to the area of the polygon formed, giving a skewed view.
However, the gaming community is very well acquainted with radar charts, as almost every online game compares stats via radar charts, and the Pokémon franchise started as, and still is, more a part of the gaming industry than of the media and entertainment sectors.
Thus we chose radar charts here.
names = ['Rayquaza', 'Palkia', 'Arceus', 'Pikachu']  # Pokemons to compare
fillColors = ['green', 'red', 'blue', 'yellow']
edgeColors = ['darkgreen', 'darkred', 'darkblue', 'yellow']
attributes = ['attack', 'sp_attack', 'defense', 'sp_defense', 'hp', 'speed']  # stats that we compare upon

# One list of stat values per Pokemon, taken from the first row matching its name.
radars = [df.loc[df['name'] == name, attributes].values.tolist()[0] for name in names]

# Evenly spaced spoke angles; the first entry of every sequence is repeated at
# the end so that each outline closes into a polygon.
angles = np.linspace(0, 2 * np.pi, len(attributes), endpoint=False)
angles = np.append(angles, angles[0])
attributes = attributes + attributes[:1]
radars = [radar + radar[:1] for radar in radars]

# START PLOTTING
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, polar=True)  # initiating polar chart
for name, radar, edge, fill in zip(names, radars, edgeColors, fillColors):
    ax.plot(angles, radar, 'o-', color=edge, linewidth=1, label=name)
    ax.fill(angles, radar, alpha=0.09, color=fill)

# The repeated closing entry is left out when labelling the spokes.
ax.set_thetagrids(angles[:-1] * 180/np.pi, attributes[:-1], fontsize=12)
ax.grid(True)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, names, loc=(0.99, 0.80))
ax.set_title(' vs '.join(names), pad=40)
plt.show()
Inference:
names = ['Giratina', 'Palkia', 'Dialga']  # Pokemons to compare
fillColors = ['red', 'blue', 'green']
edgeColors = ['darkred', 'darkblue', 'green']
attributes = ['attack', 'sp_attack', 'defense', 'sp_defense', 'hp', 'speed']  # stats that we compare upon

# Stat values of each Pokemon (first row whose name matches), in `attributes` order.
radars = []
for name in names:
    stats_row = df.loc[df['name'] == name, attributes].values.tolist()[0]
    radars.append(stats_row)

# Spoke angles around the circle; every sequence gets its first element
# appended again so the drawn outline returns to its starting point.
angles = np.linspace(0, 2 * np.pi, len(attributes), endpoint=False)
angles = np.append(angles, angles[0])
attributes = attributes + attributes[:1]
radars = [radar + radar[:1] for radar in radars]

# START PLOTTING
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, polar=True)  # initiating polar chart
for idx, radar in enumerate(radars):
    ax.plot(angles, radar, 'o-', color=edgeColors[idx], linewidth=1, label=names[idx])
    ax.fill(angles, radar, alpha=0.09, color=fillColors[idx])

# Spoke labels use the stat names without the repeated closing entry.
ax.set_thetagrids(angles[:-1] * 180/np.pi, attributes[:-1], fontsize=12)
ax.grid(True)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, names, loc=(0.99, 0.80))
ax.set_title(' vs '.join(names), pad=40)
plt.show()
Inference:
import ast

# Each 'abilities' cell holds the text of a Python list literal, e.g.
# "['Overgrow', 'Chlorophyll']"; literal_eval turns it into a real list
# without relying on the exact quoting and spacing of the text.
abilities_list = df.abilities.apply(ast.literal_eval)

# Flatten once and reuse the result for both the unique set and the counts.
all_abilities = [ability for row in abilities_list for ability in row]
unique_abilities = list(set(all_abilities))
print("Total Unique Abilities:", len(unique_abilities))

# The 15 most frequent abilities. to_frame() labels the column directly, which
# works whether or not the value_counts() result already carries a name.
tmp = pd.Series(all_abilities).value_counts().sort_values(ascending=False)[:15]
tmp.to_frame(name='Occurence')
Total Unique Abilities: 227
| Occurence | |
|---|---|
| Sturdy | 41 |
| Swift Swim | 38 |
| Keen Eye | 37 |
| Chlorophyll | 35 |
| Levitate | 33 |
| Inner Focus | 32 |
| Intimidate | 31 |
| Swarm | 25 |
| Sheer Force | 25 |
| Pressure | 25 |
| Sand Veil | 24 |
| Run Away | 24 |
| Oblivious | 24 |
| Thick Fat | 23 |
| Overgrow | 23 |
Inference: